import scrapy
from douban.items import DoubanItem
from scrapy import Request
import re

class BookSpider(scrapy.Spider):
    """Crawl the Douban "Top 250" book listing and yield one item per book.

    Each listing page contains one ``<table>`` per book inside
    ``div.indent``. For every table a ``DoubanItem`` is populated with the
    title, cover image URL, raw publishing line, score, detail-page URL,
    slogan, number of raters and publishing house. After the items of a page
    have been yielded, the "next page" link of the paginator is followed.
    """

    name = 'book'
    allowed_domains = ['douban.com']
    start_urls = ['https://book.douban.com/top250?start=0']
    # Pagination URLs that have already been scheduled. NOTE: this is a
    # class attribute, so it is shared by every instance of the spider that
    # lives in the same process.
    url_sets = set()

    # Only pages and links under this prefix are processed.
    _BOOK_SITE = 'https://book.douban.com'
    # Placeholder stored in optional fields when the page has no value.
    _NO_INFO = '暂无信息'
    # First run of digits in the "(N人评价)" rating-count text.
    _DIGITS_RE = re.compile('[0-9]+')
    # A run of CJK characters containing "出版" (publish) or "书店" (bookstore).
    _PUBLISHER_RE = re.compile("[\u4e00-\u9fa5]*\u51fa\u7248[\u4e00-\u9fa5]*|[\u4e00-\u9fa5]*\u4E66\u5E97[\u4e00-\u9fa5]*")
    # (item field, XPath relative to the book's <table>) for fields that
    # every emitted item must carry.
    _REQUIRED_XPATHS = (
        ('title', "./tr/td[2]/div/a/@title"),
        ('img_src', "./tr/td[1]/a/img/@src"),
        ('publish_detail', "./tr/td[2]/p[@class='pl']/text()"),
        ('score', "./tr/td[2]/div[@class='star clearfix']/span[@class='rating_nums']/text()"),
        ('detail_addr', "./tr/td[2]/div[@class='pl2']/a/@href"),
    )

    def parse(self, response):
        """Yield the book items of a listing page, then the next-page request.

        Args:
            response: The downloaded listing page.

        Yields:
            ``DoubanItem`` objects for every well-formed book row, followed
            by a ``scrapy.Request`` for each not-yet-visited "next" link.
        """
        if response.url.startswith(self._BOOK_SITE):
            for row in response.xpath('//div[@class="indent"]/table'):
                item = self._parse_book(row)
                if item is not None:
                    yield item

        # Pagination: follow the paginator's "next" link(s).
        next_links = response.xpath("//div[@class='paginator']/span[@class='next']/a/@href").getall()
        for href in next_links:
            # Resolve relative links; absolute links are returned unchanged.
            url = response.urljoin(href)
            if not url.startswith(self._BOOK_SITE) or url in self.url_sets:
                continue
            self.url_sets.add(url)
            # Explicit equivalent of the deprecated
            # ``Spider.make_requests_from_url``; ``url_sets`` above is the
            # de-duplication mechanism, hence ``dont_filter=True``.
            yield scrapy.Request(url, callback=self.parse, dont_filter=True)

    def _parse_book(self, row):
        """Build a ``DoubanItem`` from one book ``<table>`` selector.

        Args:
            row: Selector positioned on a single book's ``<table>``.

        Returns:
            The populated item, or ``None`` when a required field is missing
            (the row is logged and skipped so that one malformed row does not
            abort the rest of the page or the pagination).
        """
        item = DoubanItem()
        for field, xpath in self._REQUIRED_XPATHS:
            value = row.xpath(xpath).get()
            if value is None:
                self.logger.warning('Skipping book row: missing field %r', field)
                return None
            item[field] = value

        # Slogan (one-line quote); not every book has one.
        slogan = row.xpath("./tr/td[2]/p[@class='quote']/span/text()").get()
        item['slogan'] = slogan or self._NO_INFO

        # Number of people who rated the book: first run of digits in the text.
        raters_text = row.xpath("./tr/td[2]/div[@class='star clearfix']/span[@class='pl']/text()").get(default='')
        raters = self._DIGITS_RE.search(raters_text)
        item['comment_people'] = raters.group(0) if raters else self._NO_INFO

        # Publishing house, picked out of the raw publishing line.
        publisher = self._PUBLISHER_RE.search(item['publish_detail'])
        item['publish_house'] = publisher.group(0) if publisher else self._NO_INFO
        return item